GitHub

# load generated data
hour_data <- read_csv("hour_data.csv")

# The grouped data does not have the original date column, but it can be nice for visualizations. There is a function to do this in times.R
hour_data_date <- add_date_column(hour_data)

Splitting the data into training and testing

We are performing exploratory data analysis on the training data only.

set.seed(28021995)

# create a split object
data_split <- initial_split(hour_data_date, prop = 0.8)

# create the training and testing data
train_data_date <- training(x = data_split)
test_data_date  <- testing(x = data_split)

Summary stats

# summary entire train dataset
summary(train_data_date)
##       year         month                day             hour      
##  Min.   :2020   Length:9944        Min.   : 1.00   Min.   : 0.00  
##  1st Qu.:2020   Class :character   1st Qu.: 8.00   1st Qu.: 6.00  
##  Median :2021   Mode  :character   Median :16.00   Median :12.00  
##  Mean   :2021                      Mean   :15.71   Mean   :11.53  
##  3rd Qu.:2021                      3rd Qu.:23.00   3rd Qu.:18.00  
##  Max.   :2021                      Max.   :31.00   Max.   :23.00  
##                                                                   
##  20th_&_e_st_nw_arrivals new_hampshire_ave_&_24th_st_nw_arrivals
##  Min.   : 0.0000         Min.   : 0.000                         
##  1st Qu.: 0.0000         1st Qu.: 0.000                         
##  Median : 0.0000         Median : 1.000                         
##  Mean   : 0.5535         Mean   : 1.282                         
##  3rd Qu.: 1.0000         3rd Qu.: 2.000                         
##  Max.   :20.0000         Max.   :13.000                         
##                                                                 
##  15th_st_&_constitution_ave_nw_departures 21st_&_i_st_nw_arrivals
##  Min.   : 0.000                           Min.   : 0.000         
##  1st Qu.: 0.000                           1st Qu.: 0.000         
##  Median : 0.000                           Median : 0.000         
##  Mean   : 1.786                           Mean   : 0.774         
##  3rd Qu.: 2.000                           3rd Qu.: 1.000         
##  Max.   :36.000                           Max.   :23.000         
##                                                                  
##  17th_&_g_st_nw_arrivals 19th_st_&_constitution_ave_nw_departures
##  Min.   : 0.0000         Min.   : 0.0000                         
##  1st Qu.: 0.0000         1st Qu.: 0.0000                         
##  Median : 0.0000         Median : 0.0000                         
##  Mean   : 0.8188         Mean   : 0.7992                         
##  3rd Qu.: 1.0000         3rd Qu.: 1.0000                         
##  Max.   :21.0000         Max.   :33.0000                         
##                                                                  
##  23rd_&_e_st_nw_arrivals henry_bacon_dr_&_lincoln_memorial_circle_nw_arrivals
##  Min.   : 0.0000         Min.   : 0.000                                      
##  1st Qu.: 0.0000         1st Qu.: 0.000                                      
##  Median : 0.0000         Median : 1.000                                      
##  Mean   : 0.6858         Mean   : 2.617                                      
##  3rd Qu.: 1.0000         3rd Qu.: 4.000                                      
##  Max.   :14.0000         Max.   :47.000                                      
##                                                                              
##  henry_bacon_dr_&_lincoln_memorial_circle_nw_departures
##  Min.   : 0.000                                        
##  1st Qu.: 0.000                                        
##  Median : 1.000                                        
##  Mean   : 2.629                                        
##  3rd Qu.: 4.000                                        
##  Max.   :46.000                                        
##                                                        
##  jefferson_dr_&_14th_st_sw_departures jefferson_memorial_arrivals
##  Min.   : 0.000                       Min.   : 0.000             
##  1st Qu.: 0.000                       1st Qu.: 0.000             
##  Median : 1.000                       Median : 0.000             
##  Mean   : 2.928                       Mean   : 1.788             
##  3rd Qu.: 4.000                       3rd Qu.: 2.000             
##  Max.   :44.000                       Max.   :40.000             
##                                                                  
##  jefferson_memorial_departures 18th_st_&_pennsylvania_ave_nw_arrivals
##  Min.   : 0.000                Min.   : 0.0000                       
##  1st Qu.: 0.000                1st Qu.: 0.0000                       
##  Median : 0.000                Median : 0.0000                       
##  Mean   : 1.798                Mean   : 0.5459                       
##  3rd Qu.: 2.000                3rd Qu.: 1.0000                       
##  Max.   :42.000                Max.   :19.0000                       
##                                                                      
##  19th_st_&_constitution_ave_nw_arrivals
##  Min.   : 0.0000                       
##  1st Qu.: 0.0000                       
##  Median : 0.0000                       
##  Mean   : 0.8371                       
##  3rd Qu.: 1.0000                       
##  Max.   :33.0000                       
##                                        
##  us_dept_of_state_/_virginia_ave_&_21st_st_nw_arrivals
##  Min.   : 0.0000                                      
##  1st Qu.: 0.0000                                      
##  Median : 0.0000                                      
##  Mean   : 0.4419                                      
##  3rd Qu.: 0.0000                                      
##  Max.   :16.0000                                      
##                                                       
##  virginia_ave_&_25th_st_nw_arrivals virginia_ave_&_25th_st_nw_departures
##  Min.   : 0.0000                    Min.   : 0.0000                     
##  1st Qu.: 0.0000                    1st Qu.: 0.0000                     
##  Median : 0.0000                    Median : 0.0000                     
##  Mean   : 0.6104                    Mean   : 0.5786                     
##  3rd Qu.: 1.0000                    3rd Qu.: 1.0000                     
##  Max.   :12.0000                    Max.   :11.0000                     
##                                                                         
##  19th_&_g_st_nw_arrivals 19th_st_&_pennsylvania_ave_nw_departures
##  Min.   : 0.0000         Min.   :0.0000                          
##  1st Qu.: 0.0000         1st Qu.:0.0000                          
##  Median : 0.0000         Median :0.0000                          
##  Mean   : 0.2494         Mean   :0.3012                          
##  3rd Qu.: 0.0000         3rd Qu.:0.0000                          
##  Max.   :11.0000         Max.   :9.0000                          
##                                                                  
##  22nd_st_&_constitution_ave_nw_arrivals 22nd_&_h_st_nw_arrivals
##  Min.   : 0.000                         Min.   : 0.0000        
##  1st Qu.: 0.000                         1st Qu.: 0.0000        
##  Median : 0.000                         Median : 0.0000        
##  Mean   : 0.556                         Mean   : 0.5047        
##  3rd Qu.: 1.000                         3rd Qu.: 1.0000        
##  Max.   :37.000                         Max.   :11.0000        
##                                                                
##  22nd_&_h_st_nw_departures 17th_&_g_st_nw_departures 19th_&_g_st_nw_departures
##  Min.   : 0.0000           Min.   : 0.000            Min.   : 0.0000          
##  1st Qu.: 0.0000           1st Qu.: 0.000            1st Qu.: 0.0000          
##  Median : 0.0000           Median : 0.000            Median : 0.0000          
##  Mean   : 0.4967           Mean   : 0.761            Mean   : 0.2508          
##  3rd Qu.: 1.0000           3rd Qu.: 1.000            3rd Qu.: 0.0000          
##  Max.   :12.0000           Max.   :29.000            Max.   :15.0000          
##                                                                               
##  21st_st_&_constitution_ave_nw_arrivals
##  Min.   : 0.0000                       
##  1st Qu.: 0.0000                       
##  Median : 0.0000                       
##  Mean   : 0.4718                       
##  3rd Qu.: 0.0000                       
##  Max.   :18.0000                       
##                                        
##  21st_st_&_constitution_ave_nw_departures
##  Min.   : 0.0000                         
##  1st Qu.: 0.0000                         
##  Median : 0.0000                         
##  Mean   : 0.4658                         
##  3rd Qu.: 0.0000                         
##  Max.   :17.0000                         
##                                          
##  21st_st_&_pennsylvania_ave_nw_arrivals
##  Min.   : 0.0000                       
##  1st Qu.: 0.0000                       
##  Median : 0.0000                       
##  Mean   : 0.4555                       
##  3rd Qu.: 1.0000                       
##  Max.   :10.0000                       
##                                        
##  21st_st_&_pennsylvania_ave_nw_departures
##  Min.   : 0.0000                         
##  1st Qu.: 0.0000                         
##  Median : 0.0000                         
##  Mean   : 0.3819                         
##  3rd Qu.: 0.0000                         
##  Max.   :12.0000                         
##                                          
##  22nd_&_i_st_nw_/_foggy_bottom_arrivals
##  Min.   : 0.000                        
##  1st Qu.: 0.000                        
##  Median : 1.000                        
##  Mean   : 1.193                        
##  3rd Qu.: 2.000                        
##  Max.   :16.000                        
##                                        
##  22nd_&_i_st_nw_/_foggy_bottom_departures kennedy_center_arrivals
##  Min.   : 0.000                           Min.   : 0.0000        
##  1st Qu.: 0.000                           1st Qu.: 0.0000        
##  Median : 0.000                           Median : 0.0000        
##  Mean   : 1.189                           Mean   : 0.5179        
##  3rd Qu.: 2.000                           3rd Qu.: 1.0000        
##  Max.   :14.000                           Max.   :16.0000        
##                                                                  
##  kennedy_center_departures 18th_st_&_pennsylvania_ave_nw_departures
##  Min.   : 0.0000           Min.   : 0.000                          
##  1st Qu.: 0.0000           1st Qu.: 0.000                          
##  Median : 0.0000           Median : 0.000                          
##  Mean   : 0.4896           Mean   : 0.518                          
##  3rd Qu.: 0.0000           3rd Qu.: 1.000                          
##  Max.   :14.0000           Max.   :24.000                          
##                                                                    
##  22nd_st_&_constitution_ave_nw_departures 23rd_&_e_st_nw_departures
##  Min.   : 0.0000                          Min.   : 0.0000          
##  1st Qu.: 0.0000                          1st Qu.: 0.0000          
##  Median : 0.0000                          Median : 0.0000          
##  Mean   : 0.5878                          Mean   : 0.7135          
##  3rd Qu.: 1.0000                          3rd Qu.: 1.0000          
##  Max.   :33.0000                          Max.   :13.0000          
##                                                                    
##  lincoln_memorial_departures 18th_&_c_st_nw_departures
##  Min.   : 0.000              Min.   : 0.0000          
##  1st Qu.: 0.000              1st Qu.: 0.0000          
##  Median : 1.000              Median : 0.0000          
##  Mean   : 3.378              Mean   : 0.3965          
##  3rd Qu.: 5.000              3rd Qu.: 0.0000          
##  Max.   :58.000              Max.   :30.0000          
##                                                       
##  21st_&_i_st_nw_departures new_hampshire_ave_&_24th_st_nw_departures
##  Min.   : 0.0000           Min.   : 0.000                           
##  1st Qu.: 0.0000           1st Qu.: 0.000                           
##  Median : 0.0000           Median : 1.000                           
##  Mean   : 0.7431           Mean   : 1.194                           
##  3rd Qu.: 1.0000           3rd Qu.: 2.000                           
##  Max.   :20.0000           Max.   :12.000                           
##                                                                     
##  ohio_dr_&_west_basin_dr_sw_/_mlk_&_fdr_memorials_arrivals
##  Min.   : 0.000                                           
##  1st Qu.: 0.000                                           
##  Median : 0.000                                           
##  Mean   : 1.328                                           
##  3rd Qu.: 2.000                                           
##  Max.   :46.000                                           
##                                                           
##  ohio_dr_&_west_basin_dr_sw_/_mlk_&_fdr_memorials_departures
##  Min.   : 0.000                                             
##  1st Qu.: 0.000                                             
##  Median : 0.000                                             
##  Mean   : 1.336                                             
##  3rd Qu.: 2.000                                             
##  Max.   :51.000                                             
##                                                             
##  us_dept_of_state_/_virginia_ave_&_21st_st_nw_departures
##  Min.   : 0.0000                                        
##  1st Qu.: 0.0000                                        
##  Median : 0.0000                                        
##  Mean   : 0.4301                                        
##  3rd Qu.: 0.0000                                        
##  Max.   :16.0000                                        
##                                                         
##  15th_st_&_constitution_ave_nw_arrivals 17th_st_&_independence_ave_sw_arrivals
##  Min.   : 0.000                         Min.   : 0.000                        
##  1st Qu.: 0.000                         1st Qu.: 0.000                        
##  Median : 0.000                         Median : 0.000                        
##  Mean   : 1.966                         Mean   : 1.686                        
##  3rd Qu.: 3.000                         3rd Qu.: 2.000                        
##  Max.   :37.000                         Max.   :45.000                        
##                                                                               
##  18th_&_c_st_nw_arrivals 19th_st_&_pennsylvania_ave_nw_arrivals
##  Min.   : 0.0000         Min.   : 0.0000                       
##  1st Qu.: 0.0000         1st Qu.: 0.0000                       
##  Median : 0.0000         Median : 0.0000                       
##  Mean   : 0.4145         Mean   : 0.3001                       
##  3rd Qu.: 0.0000         3rd Qu.: 0.0000                       
##  Max.   :17.0000         Max.   :10.0000                       
##                                                                
##  19th_&_e_street_nw_departures jefferson_dr_&_14th_st_sw_arrivals
##  Min.   : 0.00000              Min.   : 0.000                    
##  1st Qu.: 0.00000              1st Qu.: 0.000                    
##  Median : 0.00000              Median : 1.000                    
##  Mean   : 0.09141              Mean   : 2.947                    
##  3rd Qu.: 0.00000              3rd Qu.: 4.000                    
##  Max.   :10.00000              Max.   :44.000                    
##                                                                  
##  lincoln_memorial_arrivals 20th_&_e_st_nw_departures
##  Min.   : 0.000            Min.   : 0.0000          
##  1st Qu.: 0.000            1st Qu.: 0.0000          
##  Median : 1.000            Median : 0.0000          
##  Mean   : 3.153            Mean   : 0.5774          
##  3rd Qu.: 4.000            3rd Qu.: 1.0000          
##  Max.   :49.000            Max.   :18.0000          
##                                                     
##  17th_st_&_independence_ave_sw_departures 19th_&_e_street_nw_arrivals
##  Min.   : 0.00                            Min.   :0.00000            
##  1st Qu.: 0.00                            1st Qu.:0.00000            
##  Median : 0.00                            Median :0.00000            
##  Mean   : 1.74                            Mean   :0.09222            
##  3rd Qu.: 2.00                            3rd Qu.:0.00000            
##  Max.   :44.00                            Max.   :9.00000            
##                                                                      
##  20th_st_&_virginia_ave_nw_arrivals 20th_st_&_virginia_ave_nw_departures
##  Min.   : 0.0000                    Min.   : 0.0000                     
##  1st Qu.: 0.0000                    1st Qu.: 0.0000                     
##  Median : 0.0000                    Median : 0.0000                     
##  Mean   : 0.2013                    Mean   : 0.1833                     
##  3rd Qu.: 0.0000                    3rd Qu.: 0.0000                     
##  Max.   :15.0000                    Max.   :16.0000                     
##                                                                         
##  19th_&_k_st_nw_arrivals 19th_&_k_st_nw_departures
##  Min.   :0.000000        Min.   :0.000000         
##  1st Qu.:0.000000        1st Qu.:0.000000         
##  Median :0.000000        Median :0.000000         
##  Mean   :0.002615        Mean   :0.002514         
##  3rd Qu.:0.000000        3rd Qu.:0.000000         
##  Max.   :1.000000        Max.   :1.000000         
##                                                   
##  crystal_dr_&_23rd_st_s_arrivals
##  Min.   :0.0000000              
##  1st Qu.:0.0000000              
##  Median :0.0000000              
##  Mean   :0.0001006              
##  3rd Qu.:0.0000000              
##  Max.   :1.0000000              
##                                 
##  smithsonian-national_mall_/_jefferson_dr_&_12th_st_sw_arrivals
##  Min.   :0.0000000                                             
##  1st Qu.:0.0000000                                             
##  Median :0.0000000                                             
##  Mean   :0.0001006                                             
##  3rd Qu.:0.0000000                                             
##  Max.   :1.0000000                                             
##                                                                
##  7th_&_t_st_nw_arrivals crystal_dr_&_15th_st_s_arrivals
##  Min.   :0              Min.   :0.0000000              
##  1st Qu.:0              1st Qu.:0.0000000              
##  Median :0              Median :0.0000000              
##  Mean   :0              Mean   :0.0001006              
##  3rd Qu.:0              3rd Qu.:0.0000000              
##  Max.   :0              Max.   :1.0000000              
##                                                        
##  20th_&_o_st_nw_/_dupont_south_arrivals eads_st_&_12th_st_s_arrivals
##  Min.   :0                              Min.   :0.0000000           
##  1st Qu.:0                              1st Qu.:0.0000000           
##  Median :0                              Median :0.0000000           
##  Mean   :0                              Mean   :0.0001006           
##  3rd Qu.:0                              3rd Qu.:0.0000000           
##  Max.   :0                              Max.   :1.0000000           
##                                                                     
##  iwo_jima_memorial_/_meade_&_14th_st_n_arrivals potomac_&_m_st_nw_arrivals
##  Min.   :0.0000000                              Min.   :0.0000000         
##  1st Qu.:0.0000000                              1st Qu.:0.0000000         
##  Median :0.0000000                              Median :0.0000000         
##  Mean   :0.0001006                              Mean   :0.0003017         
##  3rd Qu.:0.0000000                              3rd Qu.:0.0000000         
##  Max.   :1.0000000                              Max.   :2.0000000         
##                                                                           
##  gravelly_point_arrivals 21st_st_&_g_st_nw_arrivals
##  Min.   :0.0000000       Min.   : 0.0000           
##  1st Qu.:0.0000000       1st Qu.: 0.0000           
##  Median :0.0000000       Median : 0.0000           
##  Mean   :0.0001006       Mean   : 0.4806           
##  3rd Qu.:0.0000000       3rd Qu.: 0.0000           
##  Max.   :1.0000000       Max.   :18.0000           
##                                                    
##  21st_st_&_g_st_nw_departures maine_ave_&_9th_st_sw_arrivals
##  Min.   : 0.0000              Min.   :0.0000000             
##  1st Qu.: 0.0000              1st Qu.:0.0000000             
##  Median : 0.0000              Median :0.0000000             
##  Mean   : 0.4757              Mean   :0.0002011             
##  3rd Qu.: 0.0000              3rd Qu.:0.0000000             
##  Max.   :15.0000              Max.   :1.0000000             
##                                                             
##  roosevelt_island_arrivals n_lynn_st_&_fairfax_dr_arrivals
##  Min.   :0.0000000         Min.   :0                      
##  1st Qu.:0.0000000         1st Qu.:0                      
##  Median :0.0000000         Median :0                      
##  Mean   :0.0001006         Mean   :0                      
##  3rd Qu.:0.0000000         3rd Qu.:0                      
##  Max.   :1.0000000         Max.   :0                      
##                                                           
##  4th_st_&_g_st_sw_arrivals 20th_&_l_st_nw_arrivals
##  Min.   :0.0000000         Min.   :0.0000000      
##  1st Qu.:0.0000000         1st Qu.:0.0000000      
##  Median :0.0000000         Median :0.0000000      
##  Mean   :0.0001006         Mean   :0.0001006      
##  3rd Qu.:0.0000000         3rd Qu.:0.0000000      
##  Max.   :1.0000000         Max.   :1.0000000      
##                                                   
##  15th_st_&_pennsylvania_ave_nw_arrivals
##  Min.   :0.0000000                     
##  1st Qu.:0.0000000                     
##  Median :0.0000000                     
##  Mean   :0.0001006                     
##  3rd Qu.:0.0000000                     
##  Max.   :1.0000000                     
##                                        
##  woodley_park_metro_/_calvert_st_&_connecticut_ave_nw_arrivals
##  Min.   :0.0000000                                            
##  1st Qu.:0.0000000                                            
##  Median :0.0000000                                            
##  Mean   :0.0001006                                            
##  3rd Qu.:0.0000000                                            
##  Max.   :1.0000000                                            
##                                                               
##  15th_&_p_st_nw_arrivals reservoir_rd_&_38th_st_nw_arrivals
##  Min.   :0.0000000       Min.   :0.0000000                 
##  1st Qu.:0.0000000       1st Qu.:0.0000000                 
##  Median :0.0000000       Median :0.0000000                 
##  Mean   :0.0001006       Mean   :0.0001006                 
##  3rd Qu.:0.0000000       3rd Qu.:0.0000000                 
##  Max.   :1.0000000       Max.   :1.0000000                 
##                                                            
##  3000_connecticut_ave_nw_/_national_zoo_arrivals white_house_departures
##  Min.   :0.0000000                               Min.   :0.000000      
##  1st Qu.:0.0000000                               1st Qu.:0.000000      
##  Median :0.0000000                               Median :0.000000      
##  Mean   :0.0001006                               Mean   :0.002011      
##  3rd Qu.:0.0000000                               3rd Qu.:0.000000      
##  Max.   :1.0000000                               Max.   :3.000000      
##                                                                        
##  white_house_arrivals stadium_armory_metro_arrivals maximum_temperature
##  Min.   :0.000000     Min.   :0.0000000             Min.   :23.10      
##  1st Qu.:0.000000     1st Qu.:0.0000000             1st Qu.:50.85      
##  Median :0.000000     Median :0.0000000             Median :66.80      
##  Mean   :0.002212     Mean   :0.0001006             Mean   :63.73      
##  3rd Qu.:0.000000     3rd Qu.:0.0000000             3rd Qu.:76.80      
##  Max.   :3.000000     Max.   :1.0000000             Max.   :98.00      
##                                                     NA's   :1          
##  minimum_temperature  temperature      wind_chill      heat_index   
##  Min.   :23.10       Min.   :23.10   Min.   :10.60   Min.   : 79.3  
##  1st Qu.:50.85       1st Qu.:50.85   1st Qu.:27.62   1st Qu.: 84.9  
##  Median :66.80       Median :66.80   Median :34.00   Median : 88.8  
##  Mean   :63.73       Mean   :63.73   Mean   :33.72   Mean   : 89.4  
##  3rd Qu.:76.80       3rd Qu.:76.80   3rd Qu.:40.70   3rd Qu.: 93.2  
##  Max.   :98.00       Max.   :98.00   Max.   :49.40   Max.   :108.3  
##  NA's   :1           NA's   :1       NA's   :7790    NA's   :8316   
##  precipitation           snow             snow_depth       wind_speed    
##  Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000   1st Qu.: 4.800  
##  Median :0.000000   Median :0.0000000   Median :0.0000   Median : 7.600  
##  Mean   :0.006046   Mean   :0.0008609   Mean   :0.0316   Mean   : 8.043  
##  3rd Qu.:0.000000   3rd Qu.:0.0000000   3rd Qu.:0.0000   3rd Qu.:10.700  
##  Max.   :1.030000   Max.   :1.1300000   Max.   :2.9000   Max.   :36.400  
##  NA's   :1          NA's   :1           NA's   :1        NA's   :1       
##  wind_direction    wind_gust       visibility      cloud_cover    
##  Min.   :  0.0   Min.   :16.10   Min.   : 0.000   Min.   :  0.00  
##  1st Qu.:107.0   1st Qu.:20.80   1st Qu.: 9.900   1st Qu.: 33.10  
##  Median :191.0   Median :24.15   Median : 9.900   Median : 69.70  
##  Mean   :191.2   Mean   :25.36   Mean   : 9.491   Mean   : 61.72  
##  3rd Qu.:301.0   3rd Qu.:29.60   3rd Qu.: 9.900   3rd Qu.: 89.40  
##  Max.   :360.0   Max.   :57.50   Max.   :10.000   Max.   :100.00  
##  NA's   :3       NA's   :8334    NA's   :1        NA's   :1       
##  relative_humidity  conditions          sun_is_out    
##  Min.   : 14.19    Length:9944        Min.   :0.0000  
##  1st Qu.: 50.60    Class :character   1st Qu.:0.0000  
##  Median : 66.71    Mode  :character   Median :1.0000  
##  Mean   : 65.29                       Mean   :0.5707  
##  3rd Qu.: 81.11                       3rd Qu.:1.0000  
##  Max.   :100.00                       Max.   :1.0000  
##  NA's   :1                                            
##       date                    
##  Min.   :2020-05-01 01:00:00  
##  1st Qu.:2020-09-08 07:45:00  
##  Median :2021-01-14 10:30:00  
##  Mean   :2021-01-14 15:02:20  
##  3rd Qu.:2021-05-24 00:15:00  
##  Max.   :2021-09-30 23:00:00  
## 
# summary lincoln_memorial_departures
train_data_date %>%
  summarize(
    mean(lincoln_memorial_departures),
    max(lincoln_memorial_departures),
    min(lincoln_memorial_departures),
    sd(lincoln_memorial_departures)) 
## # A tibble: 1 × 4
##   `mean(lincoln_memo… `max(lincoln_memor… `min(lincoln_memor… `sd(lincoln_memor…
##                 <dbl>               <dbl>               <dbl>              <dbl>
## 1                3.38                  58                   0               5.38
# how can we get better summary stats? maybe not split by hour?
# we can group the data, and sum if we need to
# check how much NA in temperature variable
# sum(is.na(train_data_date$temperature)) #0
# sum(is.na(train_data_date$cloud_cover)) #0

train_data_date %>%
 # mutate(weekday = wday(date, label=TRUE)) %>%
  group_by(month) %>%
  summarize(mean(temperature), max(temperature), min(temperature), sd(temperature)) 
## # A tibble: 12 × 5
##    month `mean(temperature)` `max(temperature… `min(temperatur… `sd(temperature…
##    <chr>               <dbl>             <dbl>            <dbl>            <dbl>
##  1 Apr                  57.8              85.7             30.7            10.6 
##  2 Aug                  78.8              96               65.5             5.85
##  3 Dec                  40.7              65.9             23.1             7.58
##  4 Feb                  37.2              59.4             24.8             7.40
##  5 Jan                  38.1              56.9             23.7             6.10
##  6 Jul                  81.5              98               62.9             6.56
##  7 Jun                  76.1              93.9             55.3             7.47
##  8 Mar                  NA                NA               NA              NA   
##  9 May                  64.4              92.7             37              10.2 
## 10 Nov                  53.3              74.3             31.5             8.23
## 11 Oct                  61.0              80.1             40.8             7.55
## 12 Sep                  71.1              89.9             46.7             7.67
# density plot (no log)
train_data_date %>%
  filter(lincoln_memorial_departures != 0) %>% # notice that we here are dropping 0s
  ggplot(aes(x = lincoln_memorial_departures)) +
  geom_density() +
  theme_minimal() +
  labs(title = "xxx", 
       x = "Number of departures from Lincoln Memorial")

# What does the number 50 entail?

# density plot (log)
train_data_date %>%
  filter(lincoln_memorial_departures != 0) %>% # notice that we here are dropping 0s
  ggplot(aes(x = log(lincoln_memorial_departures))) + # add log to fix skewness
  geom_density() +
  theme_minimal() +
  labs(title = "xxx", 
       x = "Number of departures from Lincoln Memorial (logged)")

# What does the number 4 entail?

Visualizations by time at Lincoln Memorial

In this part, we use Lincoln Memorial as an example to show the change in amount of rides over time.

train_data_date %>%  # for me here there is a weird gap after July 2020, not sure why?
  ggplot(aes(x = date, y = lincoln_memorial_departures)) +
  geom_point(alpha = 0.1, color = "orange") + 
  geom_smooth() +
  labs(title = "January has the least demand of capital bikes",
       x = "Date", 
       y = "The Number of Departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# by weekday
train_data_date %>% 
  mutate(weekday = wday(date, label=TRUE)) %>% 
  ggplot(aes(x = weekday, y = lincoln_memorial_departures)) +
  geom_point(alpha = 0.1) + # clearly people cycle more on Saturdays and Sundays
  geom_smooth() +
  labs(title = "At Lincoln Memorial, Capitals bikes are mostly used for recreational purpose",
       subtitle = "Weekend has the highest demand",
       x = "Weekday (Aggregate)", 
       y = "The Number of Departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# farah's code here although similar to first one here?
#hour_data_date %>%
  #mutate(weekday = wday(date, label=TRUE)) %>%
  #ggplot(aes(x = weekday, y = lincoln_memorial)) +
  #geom_point(alpha = 0.25) +
  #labs(title = "Capital bikeshare Ridership on Week days") +
  #theme_minimal()
# by day of the month
train_data_date %>% 
  ggplot(aes(x = day, y = lincoln_memorial_departures)) +
  geom_point(alpha = 0.1, color = "salmon") + # there is no clear pattern across days of the month - aka probably not a good predictor
  geom_smooth() + 
  facet_wrap(~month, scales = "free_y") +
  labs(title = "At Lincoln Memorial, demand for bikes is more fluctuated in tourist season",
       x = "Day of a month", 
       y = "The Number of Departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# by hour
train_data_date %>%
  ggplot(aes(x = factor(hour), y = lincoln_memorial_departures)) +
  geom_col(fill = "deepskyblue2", alpha = 0.8) +
  geom_smooth() +
  theme_minimal() +
  labs(title = "At Lincoln Memorial, bikes are most demanded in the afternoon from 12pm-6pm",
       subtitle = "The highest demand occurs at 2pm",
       x = "Hour (Aggregate)", 
       y = "The Number of Departures at Lincoln Memorial")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# for a specific month and day, seeing variance by hour
train_data_date %>% 
  filter(year == 2021, month == "Aug", day == 15) %>% 
  ggplot(aes(x = factor(hour), y = lincoln_memorial_departures)) +
  geom_col(fill = "Red", alpha = 0.5) + 
  theme_minimal() +
  labs(title = "On Aug 15, the highest demand occured at 12pm and 2pm",
       x = "Hour",
       y = "The Number of Departures at Lincoln Memorial")

# by hour & month
train_data_date %>%
  ggplot(aes(x = hour, y = lincoln_memorial_departures)) +
  geom_point(alpha = 0.1, color = "dark green") + # people cycle more in the middle of the day, like between 1 and 7PM
  geom_smooth() +
  facet_wrap(~month)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# add month order
order_month <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

# by month
train_data_date %>% 
  ggplot(aes(x = month, y = lincoln_memorial_departures), level = order_month) +
  geom_point(alpha = 0.1, color = "dark red") + 
  scale_x_discrete(limits = month.abb) +
  # people cycle most in Spring, summer and fall. Surprised that there is no big spike in summer months
  geom_smooth() +
  theme_minimal() +
  labs(title = "At Lincoln Memorial, people cycle the most in Spring",
       x = "Month (Aggregate)",
       y = "The Number of Departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# by year
train_data_date %>% 
  ggplot(aes(x = factor(year), y = lincoln_memorial_departures)) +
  geom_col(width = 0.3, color = "pink") + 
  geom_smooth() +
  theme_minimal() +
  labs(title = "At Lincoln Memorial, more people use the bike in 2021 than in 2020",
       x = "Year (Aggregate)",
       y = "The Number of Departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#prep training data for holiday barchart - 1 year of data from May 2020 to April 2021
bar_date <- train_data_date %>%
  filter(year==2020 | year==2021 & month== "Jan" | year==2021 & month== "Feb"|year==2021 & month== "Mar"|year==2021 & month== "Apr") %>%
  group_by(day, month, year)%>%
  summarise(lincoln_memorial_departures=sum(lincoln_memorial_departures), n())
## `summarise()` has grouped output by 'day', 'month'. You can override using the `.groups` argument.
## confirmed: 365 days exist in this data frame but not everyday has 24 hours because the training data does not have the full data

# check for missing data
sum(is.na(bar_date$lincoln_memorial_departures)) #0
## [1] 0
# what is the mean departures per day?
mean_day <- bar_date %>%
  group_by(day) %>%
  summarize(n= mean(lincoln_memorial_departures)) %>%
  summarize(mean(n))  ## mean per day is 55.6
              
# create a data frame for holidays
holidays <- generated_holidays %>%
  filter(country == "US") 

holidays$ds <- as_date(ymd(holidays$ds)) 

holidays <- holidays %>%
  mutate(
    day = day(ymd(ds)),
    month = month(ymd(ds), label = TRUE)
                     ) %>%
  filter(year==2020 | year==2021) 

holidays_date <- left_join(bar_date, holidays, by= c('year', 'month', 'day')) %>%
    na.omit(holidays) 

holidays_date %>%
  ggplot(aes(y= holiday, x= lincoln_memorial_departures , fill=lincoln_memorial_departures)) + 
  geom_col()+
  geom_vline(xintercept = 55.6, color = "red") + 
  xlab("") +
  ylab("Number of departures") +
  scale_fill_gradient(low = "#FFFFFFFF",
                      high = "#012345") +
  #scale_fill_gradient2() +
  #scale_colour_discrete("Holidays")+
  #scale_fill_manual(c("#FF6666")) +

  guides(fill=guide_legend(title="Number of departures")) +
  theme_bw() +
  labs(title = "Demand is highest among holidays such as Independence Day, Labor Day and Memorial Day",
       x = "",
       y = "The number of departures from Licoln Memorial") 

 ggsave("holidaybar.jpg", width = 50, height = 20, units = "cm")

Visualizations by weather at Lincoln Memorial

In this part, we visualize the demand of bikes at the Lincoln Memorial station by different whether conditions.

# weather viz
train_data_date %>%
  ggplot(aes(x = temperature, y = lincoln_memorial_departures))+
  geom_point(alpha = 0.1,color = "orange") +
  labs(x = 'Temperature', y = 'Hourly Departures and Arrivals')+
  geom_smooth() +
  theme_minimal() +
  labs(title = "As the temperature goes up, departure also goes up",
       subtitle = "There is a linear relationship between demand and temperature",
       x = "Temperature (Fahrenheit)",
       y = "Aggregate number of departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

# temperature in each month
train_data_date %>%
  ggplot(aes(x = temperature, y = lincoln_memorial_departures))+
  geom_point(alpha = 0.07, color = "orange") +
  geom_smooth() +
  facet_wrap(~month) +
  theme_light() +
  labs(title = "As the temperature goes up, demand for bikes also goes up",
       subtitle = "There is a rough linear relationship between temperature demand except August",
       x = "Temperature (Fahrenheit)",
       y = "Aggregate number of departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# wind viz
train_data_date %>%
  ggplot(aes(x = wind_speed,y = lincoln_memorial_departures))+
  geom_point(alpha = 0.1,color = "deepskyblue2") +
  geom_smooth() + # data is sparse hence why we get this weird trend
  theme_minimal() +
  labs(title = "Demand for bikes is not sensitive to wind speed",
       subtitle = "No clear relationship is shown between wind speed and bike demand",
       x = "Wind Speed (miles per minute)",
       y = "Aggregate number of departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# rain viz
train_data_date %>%
  filter(precipitation != 0) %>% 
  #removing 0 to better show relationship
  ggplot(aes(x = precipitation, y = lincoln_memorial_departures))+
  geom_point(alpha = 0.1,color = "blue") +
  geom_smooth() +
  theme_minimal() +
  labs(title = "Negative relationship between bike demand and precipitation",
       subtitle = "No clear relationship is shown between wind speed and bike demand",
       x = "Precipitation (include drizzling, rain, sleet, snow, ice pellets, graupel and hail)",
       y = "Aggregate number of departures at Lincoln Memorial")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# data very sparse hence why weird trend. Aaron recommended also creating a variable that dychotomizes rain. 1 if it rains 0 if it doesn't.
#cloud viz
train_data_date %>%
  ggplot(aes(x = cloud_cover,y = lincoln_memorial_departures)) +
  geom_point(alpha = 0.5, color = "lightblue") +
  geom_smooth() +
  theme_minimal() +
  labs(title = "Demand for bikes is not sensitive to cloud coverage",
       subtitle = "No clear relationship is shown between cloud and bike demand",
       x = "Cloud Cover",
       y = "Aggregate number of departures at Lincoln Memorial")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

#sunlight
train_data_date %>%
  ggplot(aes(x = factor(sun_is_out), y = lincoln_memorial_departures)) +
  geom_col(color = "lightgoldenrod1", width = 0.3) +
  geom_smooth() +
  theme_bw() +
  facet_wrap(~month) +
  labs(title = "Passengers have high preference to ride bikes during daytime",
       subtitle = "While in cold seasons, the gap in demand shrinks drastically",
       x = "Sunlight (sun is out = 1, otherwise = 0)",
       y = "Aggregate number of departures at Lincoln Memorial")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
heatmap <- train_data_date %>%
  select (hour, lincoln_memorial_departures)  %>%
    group_by (hour) %>%
  summarize (sum_rider =sum(lincoln_memorial_departures))

#reshape data
data_melt <- melt(heatmap, id= "hour")

# plot the graph  
data_melt %>%
 ggplot(aes(hour, variable)) +
    geom_tile(aes(fill = value)) +
    labs(x = NULL,
         y = NULL) + 
  scale_fill_gradientn(colors = "blue") +
 #scale_fill_gradientn(colours = rainbow(10)) +
 #scale_fill_gradient(low = "white", mid = "blue", high = "black")+
  scale_fill_gradient(low = "#FFFFFFFF",
                    high = "#012345")  +
    theme(legend.position = "right",
          legend.direction = "vertical",
          axis.line.x = element_blank(),
          panel.grid.major.y = element_blank()) 
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.